##
## Drew Linzer
## dlinzer@emory.edu
## May 12, 2011
## 
## Linzer-lctab.R
## 
## R code to replicate analyses in "Reliable Inference in Highly 
## Stratified Contingency Tables: Using Latent Class Models as 
## Density Estimators," Political Analysis (2011) 19(2): 173-187.
## Archived at PA Dataverse, study ID hdl:1902.1/15626
## 

library(poLCA)  # requires poLCA version 1.3
library(foreign)
library(nnet)
library(Hmisc)  # for wtd.table and rcorr.cens
setwd("C:/Users/dlinzer/Documents/lctab")

# 2004 National Election Pool Exit Poll
# http://www.icpsr.umich.edu/icpsrweb/ICPSR/studies/4181
epoll <- read.spss("04181-0002-Data.sav",to.data.frame=T)
dim(epoll)

names(epoll)

# Presidential vote choice, 2004
table(epoll$PRES04,exclude=NULL)
vote <- rep(NA,nrow(epoll))
vote[epoll$PRES04=="Kerry"] <- "Kerry"
vote[epoll$PRES04=="Bush"] <- "Bush"
vote[epoll$PRES04=="Other" | epoll$PRES04=="Nader"] <- "Other"
table(vote,exclude=NULL)
table(epoll$PRES04,vote,exclude=NULL)

# race
table(epoll$RACE,exclude=NULL)
race <- rep(NA,nrow(epoll))
race[epoll$RACE=="White"] <- "White"
race[epoll$RACE=="Black"] <- "Nonwhite"
race[epoll$RACE=="Hispanic/Latino"] <- "Nonwhite"
race[epoll$RACE=="Asian"] <- "Nonwhite"
race[epoll$RACE=="Other"] <- "Nonwhite"
table(race,exclude=NULL)
table(epoll$RACE,race,exclude=NULL)

# sex
table(epoll$SEX,exclude=NULL)
sex <- rep(NA,nrow(epoll))
sex[epoll$SEX=="Male"] <- "Male"
sex[epoll$SEX=="Female"] <- "Female"
table(sex,exclude=NULL)
table(epoll$SEX,sex,exclude=NULL)

# household income (four categories)
table(epoll$INCOME,exclude=NULL)
income <- rep(NA,nrow(epoll))
income[epoll$INCOME=="Under $15,000"] <- "0to30"
income[epoll$INCOME=="$15,000-$29,999"] <- "0to30"
income[epoll$INCOME=="$30,000-$49,999"] <- "30to50"
income[epoll$INCOME=="$50,000-$74,999"] <- "50to75"
income[epoll$INCOME=="$75,000-$99,999"] <- "75to100"
income[epoll$INCOME=="$100,000-$149,999"] <- "over100"
income[epoll$INCOME=="$150,000-$199,999"] <- "over100"
income[epoll$INCOME=="$200,000 or more"] <- "over100"
table(income,exclude=NULL)
table(epoll$INCOME,income,exclude=NULL)

# age (four categories)
table(epoll$AGE9,exclude=NULL)
age <- rep(NA,nrow(epoll))
age[epoll$AGE9=="18-24"] <- "18to29"
age[epoll$AGE9=="25-29"] <- "18to29"
age[epoll$AGE9=="30-39"] <- "30to44"
age[epoll$AGE9=="40-44"] <- "30to44"
age[epoll$AGE9=="45-49"] <- "45to64"
age[epoll$AGE9=="50-59"] <- "45to64"
age[epoll$AGE9=="60-64"] <- "45to64"
age[epoll$AGE9=="65-74"] <- "65+"
age[epoll$AGE9=="75 or over"] <- "65+"
table(age,exclude=NULL)
table(epoll$AGE9,age,exclude=NULL)

# Marital status
table(epoll$MARRIED,exclude=NULL)
married <- epoll$MARRIED
table(married,exclude=NULL)
table(epoll$MARRIED,married,exclude=NULL)

# Ideology
table(epoll$PHIL3,exclude=NULL)
libcon <- epoll$PHIL3
table(libcon,exclude=NULL)
table(epoll$PHIL3,libcon,exclude=NULL)



# assemble variables into a data frame
dat <- data.frame(vote,race,sex,income,age,married,libcon)
apply(dat,2,table,exclude=NULL)


## FIGURE 1: how LC models bridge the gap between sample and population.

N <- 200
samp <- na.omit(dat[,-1])[sample(c(1:nrow(na.omit(dat[,-1]))),N),]
K.j <- sapply(apply(samp,2,table),length)
fullcell <- expand.grid(sapply(K.j,seq,from=1))
fullcell <- fullcell[do.call(order,data.frame(fullcell)),]

f <- cbind(race,sex,income,age,married,libcon)~1
lc1 <- poLCA(f,samp,nclass=1)
lc2 <- poLCA(f,samp,nclass=2,nrep=5,maxiter=4000)
lc3 <- poLCA(f,samp,nclass=3,nrep=5,maxiter=4000)
lc4 <- poLCA(f,samp,nclass=4,nrep=5,maxiter=4000)

P.true <- ftable(dat[,-1])/sum(ftable(dat[,-1]))
P.hat <- ftable(samp)/sum(ftable(samp))
P.lc1 <- poLCA.predcell(lc1,fullcell)
P.lc2 <- poLCA.predcell(lc2,fullcell)
P.lc3 <- poLCA.predcell(lc3,fullcell)
P.lc4 <- poLCA.predcell(lc4,fullcell)

table(P.hat)/prod(K.j)

windows(6,6)
par(mar=c(4.5,4.5,1,1),las=1)
plot(100*c(1:length(P.true))/length(P.true),100*cumsum(sort(P.hat)),type="l",lty=2,lwd=3,xlim=c(0,100),ylim=c(0,100),  # one sample
    xlab="Percentile of sorted cell frequency",ylab="Cumulative percent of outcomes",cex.lab=1.5,cex.axis=1.5)
lines(100*c(1:length(P.true))/length(P.true),100*cumsum(sort(P.lc1)),lwd=2,col="darkgray")
lines(100*c(1:length(P.true))/length(P.true),100*cumsum(sort(P.lc2)),lwd=2,col="darkgray")
lines(100*c(1:length(P.true))/length(P.true),100*cumsum(sort(P.lc3)),lwd=2,col="darkgray")
lines(100*c(1:length(P.true))/length(P.true),100*cumsum(sort(P.lc4)),lwd=2,col="darkgray")
lines(100*c(1:length(P.true))/length(P.true),100*cumsum(sort(P.true)),lwd=3) # population "truth"

text(76,55,"population",cex=1.3)
text(92,12,"sample",cex=1.3)
text(54,26,"1 LC",cex=1.3)
text(61,08,"4 LC",cex=1.3)
#savePlot("cell-cumuldist.pdf",type="pdf")


## FIGURE 2: comparison of MLEs to model-based cell percentage estimates.

windows(5,5)
par(mar=c(4.5,4.5,1,1),las=1)
plot(100*as.vector(ftable(dat[,-1],row.vars=1:6)/sum(ftable(dat[,-1],row.vars=1:6))),
     100*as.vector(ftable(samp,row.vars=1:6)/sum(ftable(samp,row.vars=1:6))),
     xlim=c(0,3),ylim=c(0,3),cex.lab=1.5,cex.axis=1.3,cex=1.2,
     xlab="Population cell percentages",ylab="Observed cell percentages")
abline(0,1)
#savePlot("cell-mle.pdf",type="pdf")

windows(5,5)
par(mar=c(4.5,4.5,1,1),las=1)
plot(100*as.vector(ftable(dat[,-1],row.vars=1:6)/sum(ftable(dat[,-1],row.vars=1:6))),
     100*P.lc2,cex=1.2,
     xlim=c(0,3),ylim=c(0,3),cex.lab=1.5,cex.axis=1.3,
     xlab="Population cell percentages",ylab="Model-based cell estimates")
abline(0,1)
#savePlot("cell-modelbased.pdf",type="pdf")




## Run a simulation.  Target population is males 18-29, white vs. nonwhite

indat <- dat
dat <- data.frame(vote=indat$vote,race=indat$race,age=indat$age,sex=indat$sex)
dat <- na.omit(dat)
ftable(dat)

nvals <- c(50,75,100,150,200,250,300,400,500,600,700,850,1000)
#nvals <- c(50,75,100,150,200,250,300,400,500,600,700,850,1000,1150,1300,1450,1600,1800,2000)

nsim <- 2000
for (N in nvals) {
    vote.mle.w <- vote.mle.b <- NULL
    vote.mlogit.w <- vote.mlogit.b <- NULL
    vote.lc1.w <- vote.lc1.b <- NULL
    vote.lc2.w <- vote.lc2.b <- NULL
    vote.lc3.w <- vote.lc3.b <- NULL
    vote.lc4.w <- vote.lc4.b <- NULL
    for (i in 1:nsim) {
        cat("\n",N,": \t",i,"\n")
        draw <- dat[sample(nrow(dat),N),]
        targ <- subset(draw,draw$sex=="Male" & draw$age=="18to29")
        racetab <- table(targ$race,targ$vote)
        vote.mle.b <- rbind(vote.mle.b,racetab[1,])
        vote.mle.w <- rbind(vote.mle.w,racetab[2,])

        f <- cbind(race,sex,age,vote)~1
        lc1 <- poLCA(f,draw,nclass=1,verbose=F,calc.se=F)
        lc2 <- poLCA(f,draw,nclass=2,verbose=F,calc.se=F)
        lc3 <- poLCA(f,draw,nclass=3,verbose=F,calc.se=F)
        lc4 <- poLCA(f,draw,nclass=4,verbose=F,calc.se=F)
        tab1 <- poLCA.table(race~vote,list(sex=1,age=1),lc1) # = marginal freq
        tab2 <- poLCA.table(race~vote,list(sex=1,age=1),lc2)
        tab3 <- poLCA.table(race~vote,list(sex=1,age=1),lc3)
        tab4 <- poLCA.table(race~vote,list(sex=1,age=1),lc4)
        if (ncol(tab1)==2) {
            tab1 <- cbind(tab1,c(0,0))
            tab2 <- cbind(tab2,c(0,0))
            tab3 <- cbind(tab3,c(0,0))
            tab4 <- cbind(tab4,c(0,0))
        }

        vote.lc1.w <- rbind(vote.lc1.w,tab1[2,])  # white
        vote.lc2.w <- rbind(vote.lc2.w,tab2[2,])
        vote.lc3.w <- rbind(vote.lc3.w,tab3[2,])
        vote.lc4.w <- rbind(vote.lc4.w,tab4[2,])
        vote.lc1.b <- rbind(vote.lc1.b,tab1[1,])  # nonwhite
        vote.lc2.b <- rbind(vote.lc2.b,tab2[1,])
        vote.lc3.b <- rbind(vote.lc3.b,tab3[1,])
        vote.lc4.b <- rbind(vote.lc4.b,tab4[1,])

        if (sum(na.omit(draw)$vote=="Other")==0) {
            vote.mlogit.w <- rbind(vote.mlogit.w,c(NA,NA,NA))
            vote.mlogit.b <- rbind(vote.mlogit.b,c(NA,NA,NA))
        } else {
            mlogit <- multinom(vote~sex*race*age,draw)
            xc.w <- c(1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0)    # white, male, 18-29
            exb2.w <- exp(xc.w %*% coefficients(mlogit)[1,])
            exb3.w <- exp(xc.w %*% coefficients(mlogit)[2,])
            denom.w <- 1+exb2.w+exb3.w
            vote.mlogit.w <- rbind(vote.mlogit.w,c(1/denom.w,exb2.w/denom.w,exb3.w/denom.w))
            xc.b <- c(1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0)    # nonwhite, male, 18-29
            exb2.b <- exp(xc.b %*% coefficients(mlogit)[1,])
            exb3.b <- exp(xc.b %*% coefficients(mlogit)[2,])
            denom.b <- 1+exb2.b+exb3.b
            vote.mlogit.b <- rbind(vote.mlogit.b,c(1/denom.b,exb2.b/denom.b,exb3.b/denom.b))
        }

    }
    res.w <- list(vote.mle.w,vote.lc1.w,vote.lc2.w,vote.lc3.w,vote.lc4.w,vote.mlogit.w)
    res.b <- list(vote.mle.b,vote.lc1.b,vote.lc2.b,vote.lc3.b,vote.lc4.b,vote.mlogit.b)
    dput(res.w,file=paste("resw",N,".put",sep=""))
    dput(res.b,file=paste("resb",N,".put",sep=""))
}

res.w <- list()
res.b <- list()
nvals <- c(50,75,100,150,200,250,300,400,500,600,700,850,1000)
for (n in 1:length(nvals)) {
    res.w[[n]] <- dget(paste("resw",nvals[n],".put",sep=""))
    res.b[[n]] <- dget(paste("resb",nvals[n],".put",sep=""))
}


## FIGURE 3: plot RMSE at different sample sizes for each estimator.

# 835 observed white-male-young respondents with a recorded vote choice.
sel.w <- subset(dat,dat$race=="White" & dat$sex=="Male" & dat$age=="18to29")
table(sel.w$vote,exclude=NULL)
sum(table(sel.w$vote))
table(sel.w$vote)/sum(table(sel.w$vote))

# 309 observed nonwhite-male-young respondents with a recorded vote choice.
sel.b <- subset(dat,dat$race=="Nonwhite" & dat$sex=="Male" & dat$age=="18to29")
table(sel.b$vote,exclude=NULL)
sum(table(sel.b$vote))
table(sel.b$vote)/sum(table(sel.b$vote))


# Conditional probability: Kerry vote share estimate for young male whites
windows(5,5)
par(mar=c(4.2,4.2,0.5,0.7),las=1)
ymax <- 0.5
resmat <- NULL
actual <- (table(sel.w$vote)/sum(table(sel.w$vote)))[2]
for (i in 1:length(res.w)) {
    resmat <- rbind(resmat,c(sqrt(mean((res.w[[i]][[1]][,2]/rowSums(res.w[[i]][[1]])-actual)^2,na.rm=T)), # MLE
                             sqrt(mean((res.w[[i]][[2]][,2]/rowSums(res.w[[i]][[2]])-actual)^2,na.rm=T)), # 1 LC
                             sqrt(mean((res.w[[i]][[3]][,2]/rowSums(res.w[[i]][[3]])-actual)^2,na.rm=T)), # 2 LC
                             sqrt(mean((res.w[[i]][[4]][,2]/rowSums(res.w[[i]][[4]])-actual)^2,na.rm=T)), # 3 LC
                             sqrt(mean((res.w[[i]][[5]][,2]/rowSums(res.w[[i]][[5]])-actual)^2,na.rm=T)), # 4 LC
                             sqrt(mean((res.w[[i]][[6]][,2]-actual)^2,na.rm=T)))) # mlogit
}
matplot(nvals,resmat,type="l",col=1,cex.axis=1.3,cex.lab=1.5,cex.main=1.5,
        lty=c(1,3,4,2,5,1),lwd=2,xlab="Sample size",ylab="RMSE",xlim=c(-90,max(nvals)),ylim=c(0,ymax))
text(50,resmat[1,1:5],c("MLE\n GLM","1 LC","2 LC","3 LC","4 LC"),cex=1.2,pos=2)
#savePlot("rmse-white.pdf",type="pdf")

# Conditional probability: Kerry vote share estimate for young male nonwhites
windows(5,5)
par(mar=c(4.2,4.2,0.5,0.5),las=1)
ymax <- 0.5
resmat <- NULL
actual <- (table(sel.b$vote)/sum(table(sel.b$vote)))[2]
for (i in 1:length(res.b)) {
    resmat <- rbind(resmat,c(sqrt(mean((res.b[[i]][[1]][,2]/rowSums(res.b[[i]][[1]])-actual)^2,na.rm=T)), # MLE
                             sqrt(mean((res.b[[i]][[2]][,2]/rowSums(res.b[[i]][[2]])-actual)^2,na.rm=T)), # 1 LC
                             sqrt(mean((res.b[[i]][[3]][,2]/rowSums(res.b[[i]][[3]])-actual)^2,na.rm=T)), # 2 LC
                             sqrt(mean((res.b[[i]][[4]][,2]/rowSums(res.b[[i]][[4]])-actual)^2,na.rm=T)), # 3 LC
                             sqrt(mean((res.b[[i]][[5]][,2]/rowSums(res.b[[i]][[5]])-actual)^2,na.rm=T)), # 4 LC
                             sqrt(mean((res.b[[i]][[6]][,2]-actual)^2,na.rm=T)))) # mlogit
}
matplot(nvals,resmat,type="l",col=1,cex.axis=1.3,cex.lab=1.5,cex.main=1.5,
        lty=c(1,3,4,2,5,1),lwd=2,xlab="Sample size",ylab="RMSE",xlim=c(-90,max(nvals)),ylim=c(0,ymax))
text(45,resmat[1,1:6],c("MLE","1 LC","2 LC","3 LC","4 LC","GLM"),cex=1.2,pos=2)
#savePlot("rmse-nonwhite.pdf",type="pdf")

# Marginal effect of race on Kerry vote share for young males
windows(5,5)
par(mar=c(4.2,4.2,0.5,0.5),las=1)
ymax <- 0.5
resmat <- NULL
actual <- (table(sel.b$vote)/sum(table(sel.b$vote))-table(sel.w$vote)/sum(table(sel.w$vote)))[2]
for (i in 1:length(res.b)) {
    resmat <- rbind(resmat,c(sqrt(mean((res.b[[i]][[1]][,2]/rowSums(res.b[[i]][[1]])-
                                        res.w[[i]][[1]][,2]/rowSums(res.w[[i]][[1]])-actual)^2,na.rm=T)), # MLE
                             # for 1 LC, the estimated marginal effect is always 0:
                             sqrt(mean((res.b[[i]][[2]][,2]/rowSums(res.b[[i]][[2]])-
                                        res.w[[i]][[2]][,2]/rowSums(res.w[[i]][[2]])-actual)^2,na.rm=T)), # 1 LC
                             sqrt(mean((res.b[[i]][[3]][,2]/rowSums(res.b[[i]][[3]])-
                                        res.w[[i]][[3]][,2]/rowSums(res.w[[i]][[3]])-actual)^2,na.rm=T)), # 2 LC
                             sqrt(mean((res.b[[i]][[4]][,2]/rowSums(res.b[[i]][[4]])-
                                        res.w[[i]][[4]][,2]/rowSums(res.w[[i]][[4]])-actual)^2,na.rm=T)), # 3 LC
                             sqrt(mean((res.b[[i]][[5]][,2]/rowSums(res.b[[i]][[5]])-
                                        res.w[[i]][[5]][,2]/rowSums(res.w[[i]][[5]])-actual)^2,na.rm=T)), # 4 LC
                             sqrt(mean((res.b[[i]][[6]][,2]-res.w[[i]][[6]][,2]-actual)^2,na.rm=T)))) # mlogit
}
matplot(nvals,resmat,type="l",col=1,cex.axis=1.3,cex.lab=1.5,cex.main=1.5,
        lty=c(1,3,4,2,5,1),lwd=2,xlab="Sample size",ylab="RMSE",xlim=c(-90,max(nvals)),ylim=c(0,ymax))
text(45,c(0.48,resmat[1,2:5]),c("MLE\n GLM","1 LC","2 LC","3 LC","4 LC"),cex=1.2,pos=2)
#savePlot("rmse-raceeff.pdf",type="pdf")



# Extract sets of simulation results for N=200

idx <- 5

vote.mle.w       <- res.w[[idx]][[1]]
vote.lc1.w       <- res.w[[idx]][[2]]
vote.lc2.w       <- res.w[[idx]][[3]]
vote.lc3.w       <- res.w[[idx]][[4]]
vote.lc4.w       <- res.w[[idx]][[5]]
vote.mlogit.w    <- res.w[[idx]][[6]]

vote.mle.b       <- res.b[[idx]][[1]]
vote.lc1.b       <- res.b[[idx]][[2]]
vote.lc2.b       <- res.b[[idx]][[3]]
vote.lc3.b       <- res.b[[idx]][[4]]
vote.lc4.b       <- res.b[[idx]][[5]]
vote.mlogit.b    <- res.b[[idx]][[6]]


bvplot <-function(vec,pch) {
    segments(min(vec,na.rm=T),var(vec,na.rm=T),max(vec,na.rm=T),var(vec,na.rm=T))
    segments(quantile(vec,0.10,na.rm=T),var(vec,na.rm=T),quantile(vec,0.90,na.rm=T),var(vec,na.rm=T),lwd=3)
    points(mean(vec,na.rm=T),var(vec,na.rm=T),pch=pch,cex=1.4,bg="white",lwd=2)
}

## FIGURE 4: plot estimate (bias) vs. variance.

# Whites
windows(5,5)
par(mar=c(4.2,4,0.5,0.5))
ymax <- 0.09
plot(0,0,col="white",xlim=c(0,1),ylim=c(0,ymax),xlab="Estimate of Kerry vote share",ylab="Variance",cex.lab=1.5,cex.axis=1.3)
bvplot(vote.mle.w[,2]/rowSums(vote.mle.w),19)
bvplot(vote.lc2.w[,2]/rowSums(vote.lc2.w),21)
bvplot(vote.lc3.w[,2]/rowSums(vote.lc3.w),22)
bvplot(vote.lc4.w[,2]/rowSums(vote.lc4.w),24)
abline(v=(table(sel.w$vote)/sum(table(sel.w$vote)))[2],lty=3)
text((table(sel.w$vote)/sum(table(sel.w$vote)))[2],ymax,"true value",pos=4,cex=1.2)
legend(-0.04,0.086,c("MLE","4 latent classes","3 latent classes","2 latent classes"),pch=c(19,24,22,21),bty="n",cex=1.2,pt.lwd=2)
#savePlot("bv-white.pdf",type="pdf")

# Non-whites
windows(5,5)
par(mar=c(4.2,4,0.5,0.5))
ymax <- 0.09
plot(0,0,col="white",xlim=c(0,1),ylim=c(0,ymax),xlab="Estimate of Kerry vote share",ylab="Variance",cex.lab=1.5,cex.axis=1.3)
bvplot(vote.mle.b[,2]/rowSums(vote.mle.b),19)
bvplot(vote.lc2.b[,2]/rowSums(vote.lc2.b),21)
bvplot(vote.lc3.b[,2]/rowSums(vote.lc3.b),22)
bvplot(vote.lc4.b[,2]/rowSums(vote.lc4.b),24)
abline(v=(table(sel.b$vote)/sum(table(sel.b$vote)))[2],lty=3)
text((table(sel.b$vote)/sum(table(sel.b$vote)))[2],ymax,"true value",pos=2,cex=1.2)
#savePlot("bv-nonwhite.pdf",type="pdf")

# Effect of race on vote choice for young males
windows(5,5)
par(mar=c(4.2,4,0.5,0.5))
ymax <- 0.09
plot(0,0,col="white",xlim=c(-1,1),ylim=c(0,ymax),xlab="Estimated change in Kerry vote share",ylab="Variance",cex.lab=1.5,cex.axis=1.3)
bvplot(vote.mle.b[,2]/rowSums(vote.mle.b)-vote.mle.w[,2]/rowSums(vote.mle.w),19)
bvplot(vote.lc2.b[,2]/rowSums(vote.lc2.b)-vote.lc2.w[,2]/rowSums(vote.lc2.w),21)
bvplot(vote.lc3.b[,2]/rowSums(vote.lc3.b)-vote.lc3.w[,2]/rowSums(vote.lc3.w),22)
bvplot(vote.lc4.b[,2]/rowSums(vote.lc4.b)-vote.lc4.w[,2]/rowSums(vote.lc4.w),24)
abline(v=(table(sel.b$vote)/sum(table(sel.b$vote))-table(sel.w$vote)/sum(table(sel.w$vote)))[2],lty=3)
text((table(sel.b$vote)/sum(table(sel.b$vote))-table(sel.w$vote)/sum(table(sel.w$vote)))[2],ymax,"true value",pos=4,cex=1.2)
#savePlot("bv-raceeff.pdf",type="pdf")






##########################################################
## Application: Election forecasting in small subgroups ##
##########################################################

## Analyze October 2008 CBS News polls.

## Compare to 2008 exit poll estimates.
## Source: http://www.ropercenter.uconn.edu
ep08 <- read.spss("usmi2008-natelec.por",to.data.frame=T)
ep08$PRSUS08[ep08$PRSUS08=="Did not vote"] <- NA
ep08$INCOME07[ep08$INCOME07=="$200,000 or more"] <- "$100,000-$149,999"
ep08$INCOME07[ep08$INCOME07=="$150,000-$199,999"] <- "$100,000-$149,999"

wmi.ep <- subset(ep08,ep08$SEX=="Male" & ep08$RACE=="White" & ep08$PARTYID=="Independent")
nrow(wmi.ep) # 1484

wm.ep <- subset(ep08,ep08$SEX=="Male" & ep08$RACE=="White")
nrow(wm.ep) # 5502

wtab <- function(dat,rowpct=T) {
    inclevels <- c("Under $15,000","$15,000-$29,999","$30,000-$49,999",
                   "$50,000-$74,999","$75,000-$99,999","$100,000-$149,999")
    inctab <- NULL
    for (i in 1:length(inclevels)) {
        incsub <- subset(dat,dat$INCOME07==inclevels[i])
        inctab <- rbind(inctab,wtd.table(incsub$PRSUS08,weights=incsub$WEIGHT,type="table"))
    }
    rownames(inctab) <- inclevels
    if (rowpct) {
        ret <- inctab/rowSums(inctab)
    } else {
        ret <- inctab
    }
    return(ret)
}

wtab(ep08)
wtab(wm.ep)
wtab(wm.ep,rowpct=F)



## CBS news polls
## http://www.icpsr.umich.edu/icpsrweb/ICPSR/studies/26826
## http://www.icpsr.umich.edu/icpsrweb/ICPSR/studies/26827
## http://www.icpsr.umich.edu/icpsrweb/ICPSR/studies/26832
## http://www.icpsr.umich.edu/icpsrweb/ICPSR/studies/26828
cbs2 <- read.spss("26826-0001-Data.sav",to.data.frame=T)
cbs3 <- read.spss("26827-0001-Data.sav",to.data.frame=T)
cbs4 <- read.spss("26832-0001-Data.sav",to.data.frame=T)
cbs5 <- read.spss("26828-0001-Data.sav",to.data.frame=T)

recodes <- function(dat) {
    dat$Q5[dat$Q5=="Depends (Vol.)/Undecided"] <- "Other (Vol.)"
    dat$Q5[dat$Q5=="Refused"] <- NA
    dat$Q5[dat$Q5=="Won't vote (Vol.)"] <- NA
    dat$RACE[dat$RACE=="Refused"] <- NA
    dat$INCA[dat$INCA=="Won't specify/Refused"] <- NA
    dat$INCN[dat$INCN=="Won't specify/Refused"] <- NA
    return(dat)
}
cbs2 <- recodes(cbs2)
cbs3 <- recodes(cbs3)
cbs4 <- recodes(cbs4)
cbs5 <- recodes(cbs5)

nrow(cbs2)  # N=833
nrow(cbs3)  # N=1390
nrow(cbs4)  # N=1167
nrow(cbs5)  # N=1051

table(cbs2$RACE)  # N=716 
table(cbs3$RACE)  # N=1192
table(cbs4$RACE)  # N=995 
table(cbs5$RACE)  # N=876 

table(cbs2$SEX[cbs2$RACE=="White or Caucasian"])  # N=293
table(cbs3$SEX[cbs3$RACE=="White or Caucasian"])  # N=488
table(cbs4$SEX[cbs4$RACE=="White or Caucasian"])  # N=402
table(cbs5$SEX[cbs5$RACE=="White or Caucasian"])  # N=357

table(cbs2$PRTY[cbs2$RACE=="White or Caucasian" & cbs2$SEX=="Male"])  # N=105
table(cbs3$PRTY[cbs3$RACE=="White or Caucasian" & cbs3$SEX=="Male"])  # N=169
table(cbs4$PRTY[cbs4$RACE=="White or Caucasian" & cbs4$SEX=="Male"])  # N=141
table(cbs5$PRTY[cbs5$RACE=="White or Caucasian" & cbs5$SEX=="Male"])  # N=126


## FIGURE 5 (a): White males, effect of income on vote choice.

wm2 <- subset(cbs2,cbs2$RACE=="White or Caucasian" & cbs2$SEX=="Male")
wm3 <- subset(cbs3,cbs3$RACE=="White or Caucasian" & cbs3$SEX=="Male")
wm4 <- subset(cbs4,cbs4$RACE=="White or Caucasian" & cbs4$SEX=="Male")
wm5 <- subset(cbs5,cbs5$RACE=="White or Caucasian" & cbs5$SEX=="Male")

pctOB2 <- (table(wm2$Q5,wm2$INCN)[1,]/colSums(table(wm2$Q5,wm2$INCN)))[1:6] # effect of income
pctOB3 <- (table(wm3$Q5,wm3$INCN)[1,]/colSums(table(wm3$Q5,wm3$INCN)))[1:6]
pctOB4 <- (table(wm4$Q5,wm4$INCN)[1,]/colSums(table(wm4$Q5,wm4$INCN)))[1:6]
pctOB5 <- (table(wm5$Q5,wm5$INCN)[1,]/colSums(table(wm5$Q5,wm5$INCN)))[1:6]
rawpct <- rbind(pctOB2,pctOB3,pctOB4,pctOB5)
rawpct
apply(apply(rawpct,2,range),2,diff)

Nmat <- rbind(colSums(table(wm2$Q5,wm2$INCN)),
              colSums(table(wm3$Q5,wm3$INCN)),
              colSums(table(wm4$Q5,wm4$INCN)),
              colSums(table(wm5$Q5,wm5$INCN)))
Nmat <- Nmat[,1:6]

# Latent class model-based estimation:
f <- cbind(RACE,SEX,INCN,Q5)~1
lc2 <- poLCA(f,cbs2,nclass=2,nrep=3)
lc3 <- poLCA(f,cbs3,nclass=2,nrep=3)
lc4 <- poLCA(f,cbs4,nclass=2,nrep=3)
lc5 <- poLCA(f,cbs5,nclass=2,nrep=3)

lctab2 <- poLCA.table(Q5~INCN,condition=list(RACE=1,SEX=1),lc2)
lctab3 <- poLCA.table(Q5~INCN,condition=list(RACE=1,SEX=1),lc3)
lctab4 <- poLCA.table(Q5~INCN,condition=list(RACE=1,SEX=1),lc4)
lctab5 <- poLCA.table(Q5~INCN,condition=list(RACE=1,SEX=1),lc5)

lcOB2 <- lctab2[1,]/colSums(lctab2)
lcOB3 <- lctab3[1,]/colSums(lctab3)
lcOB4 <- lctab4[1,]/colSums(lctab4)
lcOB5 <- lctab5[1,]/colSums(lctab5)
lcpct <- rbind(lcOB2,lcOB3,lcOB4,lcOB5)


# one plot per wave, show effect of income and stability across waves
windows(10,2.4)
par(mar=c(2.8,4.5,2,0.5),las=1,mfrow=c(1,nrow(rawpct)))
polldate <- c("10/28-30 poll","10/29-31 poll","10/29-11/1 poll","10/31-11/2 poll")
for (i in 1:nrow(rawpct)) {
    plot(c(1:ncol(rawpct)),100*rawpct[i,],xaxt="n",xlab="",ylim=c(0,100),cex.axis=1.2,cex.lab=1.4,cex=1.4,
            type="b",ylab="Percent voting Obama",col="gray60",lwd=2,main=polldate[i])
    text(c(1:ncol(rawpct)),100*rawpct[i,]+7,Nmat[i,])
    lines(c(1:ncol(rawpct)),100*lcpct[i,],type="b",lwd=2,pch=15)
    lines(c(1:ncol(rawpct)),100*wtab(wm.ep)[,1],lty=2)
    axis(1,c(1:ncol(rawpct)),rep("",ncol(rawpct)))
    axis(1,c(2,ncol(rawpct)-1),c("low income    ","   high income"),cex.axis=1.2)
}
#savePlot("inc-wm.pdf",type="pdf")


## FIGURE 5 (b): White male independents, effect of income on vote choice.

wmi2 <- subset(cbs2,cbs2$RACE=="White or Caucasian" & cbs2$SEX=="Male" & cbs2$PRTY=="Independent")
wmi3 <- subset(cbs3,cbs3$RACE=="White or Caucasian" & cbs3$SEX=="Male" & cbs3$PRTY=="Independent")
wmi4 <- subset(cbs4,cbs4$RACE=="White or Caucasian" & cbs4$SEX=="Male" & cbs4$PRTY=="Independent")
wmi5 <- subset(cbs5,cbs5$RACE=="White or Caucasian" & cbs5$SEX=="Male" & cbs5$PRTY=="Independent")

pctOB2 <- (table(wmi2$Q5,wmi2$INCN)[1,]/colSums(table(wmi2$Q5,wmi2$INCN)))[1:6] # effect of income
pctOB3 <- (table(wmi3$Q5,wmi3$INCN)[1,]/colSums(table(wmi3$Q5,wmi3$INCN)))[1:6]
pctOB4 <- (table(wmi4$Q5,wmi4$INCN)[1,]/colSums(table(wmi4$Q5,wmi4$INCN)))[1:6]
pctOB5 <- (table(wmi5$Q5,wmi5$INCN)[1,]/colSums(table(wmi5$Q5,wmi5$INCN)))[1:6]
rawpct <- rbind(pctOB2,pctOB3,pctOB4,pctOB5)
rawpct
apply(apply(rawpct,2,range),2,diff)

Nmat <- rbind(colSums(table(wmi2$Q5,wmi2$INCN)),
                colSums(table(wmi3$Q5,wmi3$INCN)),
                colSums(table(wmi4$Q5,wmi4$INCN)),
                colSums(table(wmi5$Q5,wmi5$INCN)))
Nmat <- Nmat[,1:6]

# Latent class model-based estimation:
f <- cbind(SEX,RACE,PRTY,INCN,Q5)~1
lc2 <- poLCA(f,cbs2,nclass=2,nrep=3)
lc3 <- poLCA(f,cbs3,nclass=2,nrep=3)
lc4 <- poLCA(f,cbs4,nclass=2,nrep=3)
lc5 <- poLCA(f,cbs5,nclass=2,nrep=3)

lctab2 <- poLCA.table(Q5~INCN,condition=list(SEX=1,RACE=1,PRTY=3),lc2)
lctab3 <- poLCA.table(Q5~INCN,condition=list(SEX=1,RACE=1,PRTY=3),lc3)
lctab4 <- poLCA.table(Q5~INCN,condition=list(SEX=1,RACE=1,PRTY=3),lc4)
lctab5 <- poLCA.table(Q5~INCN,condition=list(SEX=1,RACE=1,PRTY=3),lc5)

lcOB2 <- lctab2[1,]/colSums(lctab2)
lcOB3 <- lctab3[1,]/colSums(lctab3)
lcOB4 <- lctab4[1,]/colSums(lctab4)
lcOB5 <- lctab5[1,]/colSums(lctab5)
lcpct <- rbind(lcOB2,lcOB3,lcOB4,lcOB5)

# one plot per wave, show effect of income and stability across waves
windows(10,2.4)
par(mar=c(2.8,4.5,2,0.5),las=1,mfrow=c(1,nrow(rawpct)))
polldate <- c("10/28-30 poll","10/29-31 poll","10/29-11/1 poll","10/31-11/2 poll")
for (i in 1:nrow(rawpct)) {
    plot(c(1:ncol(rawpct)),100*rawpct[i,],xaxt="n",xlab="",ylim=c(0,100),cex.axis=1.2,cex.lab=1.4,cex=1.4,
            type="b",ylab="Percent voting Obama",col="gray60",lwd=2,main=polldate[i])
    text(c(1:ncol(rawpct)),100*rawpct[i,]+7,Nmat[i,])
    lines(c(1:ncol(rawpct)),100*lcpct[i,],type="b",lwd=2,pch=15)
    lines(c(1:ncol(rawpct)),100*wtab(wm.ep)[,1],lty=2)
    axis(1,c(1:ncol(rawpct)),rep("",ncol(rawpct)))
    axis(1,c(2,ncol(rawpct)-1),c("low income    ","   high income"),cex.axis=1.2)
}
#savePlot("inc-wmi.pdf",type="pdf")


## FIGURE 5 (c): White male independents aged 64+, effect of income on vote choice.

wmio2 <- subset(cbs2,cbs2$RACE=="White or Caucasian" & cbs2$SEX=="Male" & cbs2$PRTY=="Independent" & cbs2$AGEA=="Over 64")
wmio3 <- subset(cbs3,cbs3$RACE=="White or Caucasian" & cbs3$SEX=="Male" & cbs3$PRTY=="Independent" & cbs3$AGEA=="Over 64")
wmio4 <- subset(cbs4,cbs4$RACE=="White or Caucasian" & cbs4$SEX=="Male" & cbs4$PRTY=="Independent" & cbs4$AGEA=="Over 64")
wmio5 <- subset(cbs5,cbs5$RACE=="White or Caucasian" & cbs5$SEX=="Male" & cbs5$PRTY=="Independent" & cbs5$AGEA=="Over 64")

pctOB2 <- (table(wmio2$Q5,wmio2$INCN)[1,]/colSums(table(wmio2$Q5,wmio2$INCN)))[1:6] # effect of income
pctOB3 <- (table(wmio3$Q5,wmio3$INCN)[1,]/colSums(table(wmio3$Q5,wmio3$INCN)))[1:6]
pctOB4 <- (table(wmio4$Q5,wmio4$INCN)[1,]/colSums(table(wmio4$Q5,wmio4$INCN)))[1:6]
pctOB5 <- (table(wmio5$Q5,wmio5$INCN)[1,]/colSums(table(wmio5$Q5,wmio5$INCN)))[1:6]
rawpct <- rbind(pctOB2,pctOB3,pctOB4,pctOB5)
rawpct

Nmat <- rbind(colSums(table(wmio2$Q5,wmio2$INCN)),
              colSums(table(wmio3$Q5,wmio3$INCN)),
              colSums(table(wmio4$Q5,wmio4$INCN)),
              colSums(table(wmio5$Q5,wmio5$INCN)))
Nmat <- Nmat[,1:6]

# Latent class model-based estimation:
f <- cbind(SEX,RACE,PRTY,AGEA,INCN,Q5)~1
lc2 <- poLCA(f,cbs2,nclass=2,nrep=3)
lc3 <- poLCA(f,cbs3,nclass=2,nrep=3)
lc4 <- poLCA(f,cbs4,nclass=2,nrep=3)
lc5 <- poLCA(f,cbs5,nclass=2,nrep=3)

lctab2 <- poLCA.table(Q5~INCN,condition=list(SEX=1,RACE=1,PRTY=3,AGEA=4),lc2)
lctab3 <- poLCA.table(Q5~INCN,condition=list(SEX=1,RACE=1,PRTY=3,AGEA=4),lc3)
lctab4 <- poLCA.table(Q5~INCN,condition=list(SEX=1,RACE=1,PRTY=3,AGEA=4),lc4)
lctab5 <- poLCA.table(Q5~INCN,condition=list(SEX=1,RACE=1,PRTY=3,AGEA=4),lc5)

lcOB2 <- lctab2[1,]/colSums(lctab2)
lcOB3 <- lctab3[1,]/colSums(lctab3)
lcOB4 <- lctab4[1,]/colSums(lctab4)
lcOB5 <- lctab5[1,]/colSums(lctab5)
lcpct <- rbind(lcOB2,lcOB3,lcOB4,lcOB5)


# one plot per wave, show effect of income and stability across waves
windows(10,2.4)
par(mar=c(2.8,4.5,2,0.5),las=1,mfrow=c(1,nrow(rawpct)))
polldate <- c("10/28-30 poll","10/29-31 poll","10/29-11/1 poll","10/31-11/2 poll")
for (i in 1:nrow(rawpct)) {
    plot(c(1:ncol(rawpct)),100*rawpct[i,],xaxt="n",xlab="",ylim=c(0,100),cex.axis=1.2,cex.lab=1.4,cex=1.4,
            type="b",ylab="Percent voting Obama",col="gray60",lwd=2,main=polldate[i])
    text(c(1:ncol(rawpct)),100*rawpct[i,]+7,Nmat[i,])
    lines(c(1:ncol(rawpct)),100*lcpct[i,],type="b",lwd=2,pch=15)
    lines(c(1:ncol(rawpct)),100*wtab(wm.ep)[,1],lty=2)
    axis(1,c(1:ncol(rawpct)),rep("",ncol(rawpct)))
    axis(1,c(2,ncol(rawpct)-1),c("low income    ","   high income"),cex.axis=1.2)
}
text(1,100*rawpct[4,1]-7,Nmat[4,1])
text(5,100*rawpct[4,5]-7,Nmat[4,5])
#savePlot("inc-wmio.pdf",type="pdf")




## end of file.
